from pathlib import Path
from itertools import islice, product
from pprint import pprint
import dataclasses
import pandas as pd
import numpy as np
import numpy.typing as npt
from scipy import linalg
import seaborn as sns
import matplotlib.pyplot as plt
# BUG sous linux avec Python 3.10
# import holoviews as hv
# We do it by ourselves
# import prince
import biblio_extractor as bex
# Notebook-wide display configuration for numpy / pandas / seaborn.
np.set_printoptions(precision=4, suppress=True)
pd.set_option("display.float_format", lambda x: "{:.3f}".format(x))
pd.set_option("display.max_columns", 10)
pd.set_option("display.max_rows", 20)
pd.set_option("display.min_rows", 10)
# (11.7, 8.27) = A4 landscape
sns.set_theme(style="dark", palette="muted", font_scale=1.10, rc={"figure.figsize": (16.54, 11.7)})
# CSV produced by the extraction tool (biblio_extractor) of this repository.
DATASET_FILENAME = Path("results/activities_2022-01-29_16-33-05.csv")
print(f"{DATASET_FILENAME.stem = } {DATASET_FILENAME.suffix = }")
DATASET_FILENAME.stem = 'activities_2022-01-29_16-33-05' DATASET_FILENAME.suffix = '.csv'
Now, we load the dataset we'll use in this Notebook. It is generated using our tool in the same repository.
The dataset is a 106 * 66 matrix, with rows and columns indexed by three levels (using Pandas's MultiIndex) as follows:
The matrix is indexed by two disjoint finite sets of keywords:
An example is shown below, restricted to two compounds and two activities. Note that this is not an extract of the whole dataset, the (w/o, w/o) cells being smaller here.
| germination | germination | cytotoxicity | cytotoxicity | |||
|---|---|---|---|---|---|---|
| allelopathy | allelopathy | pharmaco | pharmaco | |||
| w/o | w/ | w/o | w/ | |||
| alkaloid | acridine | w/o | 2100 | 32 | 28 | 2104 |
| alkaloid | acridine | w/ | 1294 | 11 | 9 | 1296 |
| terpenoid/terpene | triterpene | w/o | 1283 | 11 | 9 | 1285 |
| terpenoid/terpene | triterpene | w/ | 2111 | 32 | 28 | 2115 |
Let $M$ be this matrix, and for each couple of keywords made of a compound and an activity, call $M_{ij} = (c_i, a_j)$, the ij confusion submatrix. Assume that $M_{ij}$ is of the form: \begin{bmatrix} U & V\\ X & Y \end{bmatrix}
Where :
We avoid the open world hypothesis by restricting the analysis to the papers in the domain $D$, which is the set of papers that have at least one compound and one activity. By construction:
# Load the dataset: both rows and columns carry a 3-level MultiIndex
# (class, keyword name, w/ or w/o selector) -- see the example above.
dataset = pd.read_csv(DATASET_FILENAME, index_col=[0, 1, 2], header=[0, 1, 2])
# dataset.index.names = ["class", "name", "with"]
# dataset.columns.names = ["class", "name", "with"]
# Level 1 holds the keyword names: compounds on rows, activities on columns.
all_compounds = set(dataset.index.get_level_values(1))
all_activities = set(dataset.columns.get_level_values(1))
dataset
| abiotic | ... | pharmaco | toxicity | ||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| antioxidant | drought | metal | ... | cytotoxicity | sedative | toxicity | |||||||
| w/o | w/ | w/o | w/ | w/o | ... | w/ | w/o | w/ | w/o | w/ | |||
| alkaloid | acridine | w/o | 179092 | 62176 | 240191 | 1077 | 216324 | ... | 39229 | 238785 | 2483 | 215604 | 25664 |
| w/ | 2430 | 266 | 2694 | 2 | 2439 | ... | 1296 | 2688 | 8 | 2294 | 402 | ||
| benzylamine | w/o | 180754 | 62371 | 242046 | 1079 | 218089 | ... | 40400 | 240640 | 2485 | 217161 | 25964 | |
| w/ | 768 | 71 | 839 | 0 | 674 | ... | 125 | 833 | 6 | 737 | 102 | ||
| colchicine | w/o | 175968 | 62250 | 237143 | 1075 | 213101 | ... | 39268 | 235761 | 2457 | 213018 | 25200 | |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| terpenoid/terpene | sesterterpene | w/ | 182 | 7 | 189 | 0 | 189 | ... | 113 | 188 | 1 | 177 | 12 |
| tetraterpene/carotenoid/xanthophyll | w/o | 178534 | 54855 | 232655 | 734 | 208780 | ... | 40054 | 230901 | 2488 | 208151 | 25238 | |
| w/ | 2988 | 7587 | 10230 | 345 | 9983 | ... | 471 | 10572 | 3 | 9747 | 828 | ||
| triterpene | w/o | 177099 | 61285 | 237308 | 1076 | 213234 | ... | 38416 | 235915 | 2469 | 212731 | 25653 | |
| w/ | 4423 | 1157 | 5577 | 3 | 5529 | ... | 2109 | 5558 | 22 | 5167 | 413 | ||
106 rows × 66 columns
We can extract the "old" 53*33 matrix we used in the previous version of this software, where we had only the with/with queries. In our small running example, that would be:
| germination | cytotoxicity | ||
|---|---|---|---|
| allelopathy | pharmaco | ||
| alkaloid | acridine | 11 | 1296 |
| terpenoid/terpene | triterpene | 32 | 2115 |
# Keep only the (w/, w/) cells: take the "w/" slice of index level 2 on both
# rows and columns, recovering the "old" compounds x activities matrix.
with_with_matrix = dataset.xs("w/", level=2).xs("w/", level=2, axis=1)
with_with_matrix
| abiotic | ... | pharmaco | toxicity | |||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| antioxidant | drought | metal | uv | salt | ... | wound | anticancer | cytotoxicity | sedative | toxicity | ||
| alkaloid | acridine | 266 | 2 | 257 | 80 | 163 | ... | 77 | 147 | 1296 | 8 | 402 |
| benzylamine | 71 | 0 | 165 | 23 | 80 | ... | 19 | 19 | 125 | 6 | 102 | |
| colchicine | 192 | 4 | 84 | 20 | 187 | ... | 222 | 171 | 1257 | 34 | 866 | |
| cyclopeptide | 57 | 1 | 168 | 16 | 35 | ... | 51 | 45 | 523 | 4 | 171 | |
| imidazole | 1082 | 8 | 2507 | 302 | 1195 | ... | 460 | 336 | 2816 | 486 | 1768 | |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| terpenoid/terpene | polyterpene | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 2 |
| sesquiterpene | 863 | 17 | 72 | 39 | 59 | ... | 124 | 114 | 2037 | 30 | 341 | |
| sesterterpene | 7 | 0 | 0 | 0 | 3 | ... | 2 | 6 | 113 | 1 | 12 | |
| tetraterpene/carotenoid/xanthophyll | 7587 | 345 | 592 | 392 | 513 | ... | 97 | 51 | 471 | 3 | 828 | |
| triterpene | 1157 | 3 | 51 | 46 | 59 | ... | 203 | 142 | 2109 | 22 | 413 | |
53 rows × 33 columns
We do some sanity checks explained earlier :
# sanity check #1
# each keyword contributes two rows/cols (w/ and w/o), hence the factor 2
assert dataset.shape == (2 * len(all_compounds), 2 * len(all_activities))
dataset
# sanity check : group by summing on level 2 on both rows and cols produce a matrix of constants : the number of papers
submatrix_sum = dataset.groupby(level=1).sum().groupby(level=1, axis=1).sum()
number_of_papers = np.unique(submatrix_sum.values)
# if the Scopus collection did not evolve during while querying
# (a single distinct value means every 2x2 submatrix sums to the same total)
assert len(number_of_papers) == 1
number_of_papers = number_of_papers[0]
print(f"The domain contains {number_of_papers} papers")
with_with_total = with_with_matrix.values.sum()
print(
    f"Total number of positive/positive occurrences is {with_with_total} for {number_of_papers} papers (average={with_with_total/number_of_papers})"
)
The domain contains 243964 papers Total number of positive/positive occurrences is 383330 for 243964 papers (average=1.571256414880884)
# Extract one 2x2 confusion submatrix: compound "acridine" vs activity "cytotoxicity".
acridine_cytotoxicity_submatrix = dataset.loc[
    (
        "alkaloid",
        "acridine",
    ),
    (
        "pharmaco",
        "cytotoxicity",
    ),
]
print(f"Among {number_of_papers} papers, there are")
# bex.SELECTORS is ("w/o", "w/") judging by the printed output below -- TODO confirm in biblio_extractor
for i, j in product(bex.SELECTORS, bex.SELECTORS):
    print(f"{acridine_cytotoxicity_submatrix.loc[i,j]} papers {i} acridine and {j} cytotoxicity in their keywords")
print("The acridine and cytotoxicity confusion matrix is as follows")
acridine_cytotoxicity_submatrix
Among 243964 papers, there are 202039 papers w/o acridine and w/o cytotoxicity in their keywords 39229 papers w/o acridine and w/ cytotoxicity in their keywords 1400 papers w/ acridine and w/o cytotoxicity in their keywords 1296 papers w/ acridine and w/ cytotoxicity in their keywords The acridine and cytotoxicity confusion matrix is as follows
C:\Users\user\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\core\indexing.py:925: PerformanceWarning: indexing past lexsort depth may impact performance. return self._getitem_tuple(key)
| w/o | w/ | |
|---|---|---|
| w/o | 202039 | 39229 |
| w/ | 1400 | 1296 |
We compute marginal sums on rows and cols and add them to the original dataset.
NOTE we actually should do the other way around, that is to query Scopus for margins and for all (w/, w/) cells and then fill in the missing ones.
# Build an augmented copy of the dataset with marginal sums (the Σ row/column
# visible in the table below).
margin_idx = (bex.CLASS_SYMB, bex.MARGIN_SYMB, bex.SELECTORS[1])
# Summing the w/ and w/o rows of each compound yields the same column totals
# for every compound, so drop_duplicates keeps a single margin row.
margin_cols = dataset.groupby(level=1).sum().drop_duplicates().reset_index(drop=True)
margin_cols.index = pd.MultiIndex.from_tuples([margin_idx])
# Same reasoning column-wise: take the (identical) per-activity totals of the
# first activity as the row margins.
margin_rows = dataset.groupby(level=1, axis=1).sum().iloc[:, 0]
margin_rows.name = margin_idx
margin_rows = pd.DataFrame(margin_rows)
dataset_margins = dataset.copy()
dataset_margins[margin_idx] = margin_rows
# The bottom-right corner (margin of margins) is filled with the paper count.
dataset_margins = pd.concat([dataset_margins, margin_cols]).fillna(number_of_papers).astype(int)
dataset_margins
| * | abiotic | ... | pharmaco | toxicity | |||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Σ | antioxidant | drought | ... | sedative | wound | toxicity | |||||||
| w/ | w/ | w/o | w/ | w/o | ... | w/o | w/ | w/o | w/ | w/o | |||
| alkaloid | acridine | w/o | 241268 | 62176 | 179092 | 1077 | 240191 | ... | 238785 | 6707 | 234561 | 25664 | 215604 |
| w/ | 2696 | 266 | 2430 | 2 | 2694 | ... | 2688 | 77 | 2619 | 402 | 2294 | ||
| benzylamine | w/o | 243125 | 62371 | 180754 | 1079 | 242046 | ... | 240640 | 6765 | 236360 | 25964 | 217161 | |
| w/ | 839 | 71 | 768 | 0 | 839 | ... | 833 | 19 | 820 | 102 | 737 | ||
| colchicine | w/o | 238218 | 62250 | 175968 | 1075 | 237143 | ... | 235761 | 6562 | 231656 | 25200 | 213018 | |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| terpenoid/terpene | tetraterpene/carotenoid/xanthophyll | w/o | 233389 | 54855 | 178534 | 734 | 232655 | ... | 230901 | 6687 | 226702 | 25238 | 208151 |
| w/ | 10575 | 7587 | 2988 | 345 | 10230 | ... | 10572 | 97 | 10478 | 828 | 9747 | ||
| triterpene | w/o | 238384 | 61285 | 177099 | 1076 | 237308 | ... | 235915 | 6581 | 231803 | 25653 | 212731 | |
| w/ | 5580 | 1157 | 4423 | 3 | 5577 | ... | 5558 | 203 | 5377 | 413 | 5167 | ||
| * | Σ | w/ | 243964 | 62442 | 181522 | 1079 | 242885 | ... | 241473 | 6784 | 237180 | 26066 | 217898 |
107 rows × 67 columns
The https://en.wikipedia.org/wiki/Confusion_matrix can be equipped with a lot of different metrics (or score) that aggregate the $2 \times 2$ matrix to a single scalar value. One of the best is the https://en.wikipedia.org/wiki/Phi_coefficient
However, we are not exactly in the context of a confusion table. Moreover, we do not have an argument to privilege one metric over another. So we explore a part of the space.
We may analyze the "influence" of each margin to the score to design our own
def tt_projection_metric(arr):
    """Project the 2x2 confusion matrix onto its TT (w/, w/) cell.

    Amounts to reading the score off "the old" with/with matrix.
    """
    return arr.reshape(2, 2)[1, 1]
# def intersection_metric(arr):
# """The trivial one. Just keep the bottom right cell. Amount to compute on "the old matrix" """
# [FF, FT], [TF, TT] = arr.reshape(2, 2)
# return TT / (FF + FT + TF + TT)
def row_implication_metric(arr):
    """Share of papers carrying the first keyword that also carry the second.

    Scores add to 1 row-wise.
    """
    cells = arr.reshape(2, 2)
    return cells[1, 1] / (cells[1, 0] + cells[1, 1])
def col_implication_metric(arr):
    """Share of papers carrying the second keyword that also carry the first.

    Scores add to 1 column-wise.
    """
    cells = arr.reshape(2, 2)
    return cells[1, 1] / (cells[0, 1] + cells[1, 1])
def fowlkes_mallows_metric(arr):
    """Fowlkes-Mallows index: geometric mean of the row and column implications."""
    cells = arr.reshape(2, 2)
    tt = cells[1, 1]
    return tt / np.sqrt((cells[0, 1] + tt) * (cells[1, 0] + tt))
def accuracy_metric(arr):
    """Accuracy of the confusion matrix: share of "agreeing" (diagonal) papers."""
    q = arr.reshape(2, 2)
    return (q[1, 1] + q[0, 0]) / (q[0, 0] + q[0, 1] + q[1, 0] + q[1, 1])
def x_metric(arr):
    """Home-made score: normalized difference between diagonal and anti-diagonal."""
    q = arr.reshape(2, 2)
    return (q[0, 0] + q[1, 1] - q[0, 1] - q[1, 0]) / (q[0, 0] + q[0, 1] + q[1, 0] + q[1, 1])
def fraction_metric(arr):
    """The % of papers that have both keywords among the ones having at least one.

    This is the Jaccard index of the two keyword sets.  The four confusion
    cells are *disjoint* counts, so the papers carrying at least one keyword
    number FT + TF + TT.  (The previous FT + TF - TT subtracted the
    intersection from already-disjoint cells -- the |A|+|B|-|A∩B| union
    formula only applies when FT/TF are marginal totals -- deflating the
    denominator and letting the score exceed 1.)
    """
    [FF, FT], [TF, TT] = arr.reshape(2, 2)
    return TT / (FT + TF + TT)
# The metrics retained for the exploration below (each maps a 2x2 confusion
# submatrix to a scalar).
metrics = [
    tt_projection_metric,
    row_implication_metric,
    col_implication_metric,
    fowlkes_mallows_metric,
    # accuracy_metric, # CHECK if BUG
    # x_metric, # CHECK if BUG
    fraction_metric,
]
print("An example on the acridine/cytotoxicity submatrix, its score for each metric")
print(acridine_cytotoxicity_submatrix)
# apply each metric to the same 2x2 example for comparison
for metric in metrics:
    print(f"{metric.__name__:<22} = {metric(acridine_cytotoxicity_submatrix.values)}")
An example on the acridine/cytotoxicity submatrix, its score for each metric
w/o w/
w/o 202039 39229
w/ 1400 1296
tt_projection_metric = 1296
row_implication_metric = 0.4807121661721068
col_implication_metric = 0.03198025909932141
fowlkes_mallows_metric = 0.12398911091858036
fraction_metric = 0.03294943177484555
Remark: we'd love to say that "acridine partly implies cytotoxicity"! Among 33 activities, about 48% of papers about acridine are about cytotoxicity! It seems important to look for implication rules.
First, we have some fun with Numpy
# redimension the values to a 4D array
C, A = len(all_compounds), len(all_activities)
print(C, A)
# reshape (2C, 2A) -> (C, 2, A, 2) then move the row selector axis next to the
# column selector: M_2_2[i, j] is the 2x2 confusion submatrix of couple (i, j)
M_2_2 = np.moveaxis(dataset.values.reshape((C, 2, A, 2)), 1, -2)
# 1D is easier for apply_along_axis
M_4 = M_2_2.reshape((C * A, 4))
print(f"{M_2_2.shape = }")
print(f"{M_4.shape = }")
# M.sum(axis=(2,3)) or similarly
# every 2x2 submatrix must sum to the (constant) number of papers
np_nb_paper = np.sum(M_2_2, axis=(2, 3), keepdims=False)
print(f"{np.all(np_nb_paper == number_of_papers) = } (with {number_of_papers = })")
53 33 M_2_2.shape = (53, 33, 2, 2) M_4.shape = (1749, 4) np.all(np_nb_paper == number_of_papers) = True (with number_of_papers = 243964)
We obtain the same as submatrix_sum
# Apply each metric to every 2x2 submatrix and keep one compounds x activities
# score DataFrame per metric; also persist each one next to the source dataset.
score_df = {}
for metric in metrics:
    metric_name = metric.__name__
    matrix = np.apply_along_axis(metric, 1, M_4).reshape((C, A))
    score_df[metric_name] = pd.DataFrame(matrix, index=with_with_matrix.index, columns=with_with_matrix.columns)
    # print(score_df)
    filename = Path(f"{DATASET_FILENAME.stem}_{metric_name}{DATASET_FILENAME.suffix}")
    score_df[metric_name].to_csv(Path("results") / filename)
# descriptive statistics of each score matrix
scores_summary_df = pd.DataFrame.from_dict(
    {f_name: [df.values.min(), df.values.mean(), df.values.max(), df.values.std()] for f_name, df in score_df.items()},
    orient="index",
    columns=["min", "mean", "max", "std"],
)
scores_summary_df
| min | mean | max | std | |
|---|---|---|---|---|
| tt_projection_metric | 0.000 | 219.171 | 27855.000 | 1081.415 |
| row_implication_metric | 0.000 | 0.038 | 1.000 | 0.081 |
| col_implication_metric | 0.000 | 0.022 | 0.583 | 0.050 |
| fowlkes_mallows_metric | 0.000 | 0.018 | 0.504 | 0.035 |
| fraction_metric | 0.000 | 0.008 | 0.957 | 0.036 |
Here, we implement Correspondence Analysis from scratch using SVD from scipy.linalg.
We explore the results on all previous metrics.
# sanity checks
def sanity_scores(all_dfs):
    """Print margin totals for each score matrix.

    Both raw margins must sum to the grand total N, and both normalized
    margins must sum to 1.
    """
    for metric_name, df in all_dfs.items():
        # The input matrix: the dataset after a metric is applied
        M = df.values
        N = np.sum(M)
        # Z is M normalised
        Z = M / N
        print(f"{metric_name} (grand total {N = })")
        R = np.sum(M, axis=1)  # equivalently M @ np.ones(M.shape[1])
        C = np.sum(M, axis=0)  # equivalently np.ones(M.shape[0]) @ M
        print(f" {np.sum(R) = :.4f} and {np.sum(C) = :.4f}")
        r = np.sum(Z, axis=1)
        c = np.sum(Z, axis=0)
        # print(f" {r[:, np.newaxis].shape = } {c[:, np.newaxis].shape = }")
        print(
            f" {np.sum(r) = :.4f} and {np.sum(c) = :.4f}"
        )  # \n{100*r = } \n{100*c = }
sanity_scores(score_df)
tt_projection_metric (grand total N = 383330)
np.sum(R) = 383330.0000 and np.sum(C) = 383330.0000
np.sum(r) = 1.0000 and np.sum(c) = 1.0000
row_implication_metric (grand total N = 65.7895272917367)
np.sum(R) = 65.7895 and np.sum(C) = 65.7895
np.sum(r) = 1.0000 and np.sum(c) = 1.0000
col_implication_metric (grand total N = 38.76436344379413)
np.sum(R) = 38.7644 and np.sum(C) = 38.7644
np.sum(r) = 1.0000 and np.sum(c) = 1.0000
fowlkes_mallows_metric (grand total N = 31.23157570524735)
np.sum(R) = 31.2316 and np.sum(C) = 31.2316
np.sum(r) = 1.0000 and np.sum(c) = 1.0000
fraction_metric (grand total N = 14.778185526117818)
np.sum(R) = 14.7782 and np.sum(C) = 14.7782
np.sum(r) = 1.0000 and np.sum(c) = 1.0000
def normalize(df, *, method="laplace"):
    """Center and scale a table for correspondence analysis.

    The table is divided by its grand total, centered by its independence
    model (outer product of the margins), then scaled according to *method*:
    "laplace" scales both sides by the inverse square root of the margins,
    "rows" (resp. "cols") scales by the inverse row (resp. column) margins.

    Returns (S, r, c): the normalized matrix and the row/column margins.
    Raises ValueError on an unknown *method*.
    """
    raw = df.values
    # relative frequencies
    relative = raw / np.sum(raw)
    # margins of the relative table
    row_mass = np.sum(relative, axis=1)
    col_mass = np.sum(relative, axis=0)
    # residuals w.r.t. the independence model
    centered = relative - np.outer(row_mass, col_mass)
    if method == "laplace":
        scaled = np.diag(row_mass ** (-0.5)) @ centered @ np.diag(col_mass ** (-0.5))
    elif method == "rows":
        scaled = np.diag(row_mass ** (-1)) @ centered
    elif method == "cols":
        scaled = centered @ np.diag(col_mass ** (-1))
    else:
        raise ValueError(f"unknown normalization {method = }")
    return scaled, row_mass, col_mass
# Inspect the normalized matrices: after centering by the independence model
# the margins no longer sum to 1.
for metric_name, matrix in score_df.items():
    S, r, c = normalize(matrix, method="laplace")
    print(f"{metric_name} {S.shape}, margins:")
    # print(S)
    print(f"rows sum is {np.sum(S, axis=1).sum()}\n{np.sum(S, axis=1)}")
    print(f" cols sum is {np.sum(S, axis=0).sum()}\n{np.sum(S, axis=0)}")
tt_projection_metric (53, 33), margins: rows sum is 0.9753857591248027 [-0.0141 0.0227 0.1352 0.0869 0.1013 0.0759 0.0174 0.0249 0.0879 0.0123 0.0221 0.0139 0.1372 0.0719 0.0801 0.1284 0.0072 0.0378 0.0289 0.0399 0.0109 0.0495 0.0251 0.0725 0.0342 0.004 -0.0041 -0.0395 -0.2416 -0.0013 -0.026 -0.0136 -0.2169 -0.0639 0.0378 -0.0699 -0.0834 -0.0359 -0.0182 -0.0076 0.0642 0.0276 0.0121 0.1107 0.008 0.0505 0.0995 0.0232 -0.0022 0.0955 -0.01 -0.0398 0.0061] cols sum is 0.975385759124803 [-0.5901 0.1177 -0.1521 -0.1046 0.0796 0.0528 0.0124 0.0946 0.0299 0.0409 0.0355 0.0132 0.0462 0.0098 -0.0081 0.0172 -0.0529 0.1168 0.0406 0.0074 -0.0242 0.1552 -0.0848 0.0699 0.1968 0.1045 0.1936 -0.0301 -0.0179 0.0691 0.4415 -0.0063 0.1012] row_implication_metric (53, 33), margins: rows sum is 0.015783114449158076 [-0.0639 0.0319 0.0919 0.0936 0.0357 0.0153 0.0673 -0.0074 0.0803 0.0665 0.003 0.0803 0.1136 0.0294 0.1014 0.0585 -0.0104 -0.0023 -0.0044 0.0062 -0.008 0.0566 0.004 0.0058 0.0786 -0.0167 -0.0329 -0.1067 -0.0804 -0.1079 -0.0662 -0.0625 -0.0608 -0.0792 0.0293 -0.0712 -0.056 -0.0872 -0.1419 -0.0933 0.0305 0.0133 0.0287 0.0667 -0.0453 0.1295 0.1397 0.0535 -0.1292 0.0207 -0.1195 -0.0339 -0.0285] cols sum is 0.015783114449157924 [-0.0446 0.0038 0.0159 0.0016 0.0105 -0.0047 -0.0021 -0.0042 -0.0043 -0.0021 0.0037 -0.0006 -0.0038 -0.0008 0.0008 -0.0007 -0.0117 0.0263 0.0045 -0.0003 0.0252 -0.0098 0. -0.0003 0.0008 -0.017 0.0072 0. 
-0.0003 -0.0055 -0.0119 0.008 0.0324] col_implication_metric (53, 33), margins: rows sum is 0.04958954865694797 [-0.0028 0.0009 0.0125 0.0046 0.0024 0.002 -0.0003 -0.0033 0.0048 0.001 -0.0045 -0.0002 0.0019 0.002 0.0059 0.021 -0.0011 0.0046 0.0035 -0.0088 -0.0009 0.0031 0.0126 -0.0037 0.0096 0.0039 0.0127 -0.0048 -0.0263 -0.0006 0.0006 -0.0017 -0.0079 -0.0102 0.0022 -0.0036 -0.0233 -0.0081 -0.0022 -0.0019 -0.0098 0.0037 0.0065 -0.0145 -0.0041 0.0078 0.0169 0.0068 -0.0003 0.0361 -0.0016 0.0017 0.0051] cols sum is 0.04958954865694818 [-0.2441 0.0398 -0.1525 -0.1759 -0.0035 0.1306 -0.0409 0.301 -0.0607 0.1283 0.3658 0.0449 -0.0469 -0.0558 -0.1571 0.0007 -0.0941 0.0382 0.0001 -0.0502 -0.065 0.044 -0.1113 0.1111 0.1348 0.0173 0.0916 -0.1309 -0.0819 0.0439 0.1412 -0.0903 -0.0227] fowlkes_mallows_metric (53, 33), margins: rows sum is 0.3288626363437355 [-0.0397 0.0044 0.0299 0.0399 -0.0292 0.0031 0.0432 -0.0353 0.0227 -0.0133 -0.0377 0.0232 0.0149 -0.0187 0.1091 0.0408 -0.0289 0.1275 -0.0193 -0.043 0.0053 0.061 0.0599 -0.0587 0.0952 -0.0119 0.0004 -0.0721 -0.1137 -0.0205 -0.0412 -0.0441 -0.0563 -0.0368 0.0664 -0.0736 -0.0858 -0.0777 -0.0435 -0.0485 -0.0245 -0.001 0.0195 -0.0125 -0.0123 0.1755 0.2255 0.0917 -0.0231 0.1794 -0.0347 0.0432 0.0051] cols sum is 0.3288626363437359 [-0.1992 0.1758 -0.0904 -0.0422 0.0221 0.0805 0.0123 0.186 0.0302 0.0303 0.1545 -0.003 0.0817 -0.0461 -0.0176 0.1255 -0.0662 0.0223 -0.0374 -0.0392 -0.0128 0.0095 -0.1169 0.0196 0.076 0.0007 -0.0432 -0.085 -0.0797 0.0156 0.1715 -0.0893 0.0829] fraction_metric (53, 33), margins: rows sum is 3.2770501096460447 [ 0.0571 0.1033 0.0433 0.101 -0.0102 0.0073 0.2 0.0468 0.0784 0.0097 0.0426 0.1196 0.0942 0.0419 0.248 0.0068 0.03 0.3531 0.0505 -0.0469 0.131 0.1885 0.1643 -0.0601 0.2101 0.0607 0.0453 -0.0045 -0.4927 -0.0022 0.041 0.04 -0.4435 0.054 0.2094 -0.0142 -0.0744 0.0093 0.0795 0.0035 0.0037 0.0831 0.1209 -0.0768 0.0672 0.3459 0.4423 0.2844 -0.0025 0.1488 0.1056 -0.0109 0.044 ] cols sum is 
3.277050109646045 [-0.8841 0.5308 -0.1696 0.0731 -0.0664 0.3974 0.0472 0.5009 0.2798 0.2061 0.3305 0.0443 0.36 0.1277 0.0763 0.5009 0.0546 0.0809 0.1091 0.0156 -0.0845 0.0928 -0.236 0.2277 0.1887 0.0506 0.0554 0.0679 0.0028 0.1906 0.0977 0.0229 -0.015 ]
# the number of dimensions we project onto
NAXIS = 2
# how many example coordinates the (commented-out) debug prints would show
NEXAMPLES = 10
# TODO use the standard scipy API
@dataclasses.dataclass
class CA:
    """Result of a correspondence analysis."""
    rows_coordinates: npt.NDArray  # principal coordinates of the row points
    cols_coordinates: npt.NDArray  # principal coordinates of the column points
    inertias: npt.NDArray  # principal inertias (squared singular values)
def correspondence_analysis(df) -> CA:
    """Run a correspondence analysis on the score matrix *df*.

    Returns a CA holding the principal row and column coordinates and the
    principal inertias (squared singular values).  Dead locals of the
    previous version (K, total_inertia, explained_inertias) only fed
    commented-out debug prints and have been removed.
    """
    # center and scale the table (chi-square style normalization)
    S, r, c = normalize(df, method="laplace")
    # decompose the standardized residuals
    U, D, Vt = linalg.svd(S)
    # rectangular diagonal matrix carrying the singular values
    Da = np.eye(U.shape[0], Vt.shape[0]) * D
    # SVD ensures that Vt @ Vt.T == I == U.T @ U
    assert np.allclose(Vt @ Vt.T, np.identity(Vt.shape[0]))
    assert np.allclose(U.T @ U, np.identity(U.shape[0]))
    # BUG/TODO : this depends on the normalization method !
    # standard coordinates, then principal coordinates (scaled by inertia)
    r_std_coords = np.diag(r ** (-0.5)) @ U
    r_coords = r_std_coords @ Da
    c_std_coords = np.diag(c ** (-0.5)) @ Vt.T
    c_coords = c_std_coords @ Da.T
    # eigenvalues of the decomposition
    principal_inertias = np.diag(Da) ** 2
    return CA(r_coords, c_coords, principal_inertias)
# Run CA on every score matrix and report how much inertia the first NAXIS
# principal axes capture.
for m_name, df in score_df.items():
    res = correspondence_analysis(df)
    print(m_name)
    print(f" total inertia {res.inertias.sum()} ({res.inertias.shape})")
    print(
        f" explained inertias on first {NAXIS} axis (%) = {100*res.inertias[:NAXIS].sum()/res.inertias.sum():.4f} {100*res.inertias[:NAXIS]/res.inertias.sum()}"
    )
tt_projection_metric
total inertia 0.6768601446110587 ((33,))
explained inertias on first 2 axis (%) = 52.5342 [34.2743 18.2599]
row_implication_metric
total inertia 1.213574590857687 ((33,))
explained inertias on first 2 axis (%) = 34.8698 [17.9887 16.8811]
col_implication_metric
total inertia 1.5098879411451203 ((33,))
explained inertias on first 2 axis (%) = 34.2811 [19.7614 14.5197]
fowlkes_mallows_metric
total inertia 1.1359045817440268 ((33,))
explained inertias on first 2 axis (%) = 30.0285 [18.8789 11.1496]
fraction_metric
total inertia 2.091744214063845 ((33,))
explained inertias on first 2 axis (%) = 34.8782 [19.75 15.1283]
Now, we draw compounds and activities as scatterplots on the first 2 dimensions.
import numbers
def data_to_ca(df, r_coords, c_coords, axis=None):
    """Assemble a tidy frame of CA point coordinates suitable for plotting.

    axis selects the point clouds to include: None for both, 0 for the row
    (compound) points only, 1 for the column (activity) points only.
    Raises TypeError on any other axis value.
    """
    if axis is None:
        selected = [0, 1]
    elif isinstance(axis, numbers.Number):
        selected = [axis]
    else:
        raise TypeError(f"cannot convert axis type {axis}")
    pieces = []
    if 0 in selected:
        # row points, indexed by compound name (class level dropped)
        compounds = pd.DataFrame(index=df.index.droplevel())
        compounds["1st axis"] = r_coords[:, 0]
        # flip the 2nd axis for display
        compounds["2nd axis"] = -r_coords[:, 1]
        compounds["class"] = df.index.droplevel(1)
        compounds["mass"] = df.values.sum(axis=1)
        compounds["type"] = "Compound"
        pieces.append(compounds)
    if 1 in selected:
        # column points, indexed by activity name (class level dropped)
        activities = pd.DataFrame(index=df.columns.droplevel())
        activities["1st axis"] = c_coords[:, 0]
        activities["2nd axis"] = -c_coords[:, 1]
        activities["class"] = df.columns.droplevel(1)
        activities["mass"] = df.values.sum(axis=0)
        activities["type"] = "Activity"
        pieces.append(activities)
    return pd.concat(pieces, axis=0)
# df = score_df["tt_projection_metric"]
# ca_res = correspondence_analysis(df)
# plot_ca = data_to_ca(df, ca_res.rows_coordinates, ca_res.cols_coordinates, axis=None)
# plot_ca
# offset applied to the point labels relative to the marker position
SHIFT = 0.00
for f_name, df in score_df.items():
    ca_res = correspondence_analysis(df)
    # the implication metrics are only meaningful on one side,
    # so plot a single point cloud for them
    axis = None
    if f_name == "row_implication_metric":
        axis = 0
    if f_name == "col_implication_metric":
        axis = 1
    plot_ca = data_to_ca(df, ca_res.rows_coordinates, ca_res.cols_coordinates, axis=axis)
    ax = sns.scatterplot(data=plot_ca, x="1st axis", y="2nd axis", hue="class", size="mass", sizes=(64, 512))
    ax.set_title(
        f"CA for {f_name}, captured inertia={100*ca_res.inertias[:2].sum():.2f}% {f'(only {axis = })' if axis is not None else ''}"
    )
    ax.set_xlabel(f"1st axis ({100*ca_res.inertias[0]:.2f})")
    ax.set_ylabel(f"2nd axis ({100*ca_res.inertias[1]:.2f})")
    # annotate each point with its keyword (the frame index)
    for index, row in plot_ca.iterrows():
        ax.annotate(index, (row["1st axis"] + SHIFT, row["2nd axis"] + SHIFT))
    plt.show()
def clean_implications(axis=0):
    """Return a tidy (compound, activity)-indexed frame of implication scores.

    axis=0 uses the row implication (compound => activity), axis=1 the column
    implication (activity => compound).  The '%' column is the implication
    score, 'mass' the supporting (w/, w/) paper count.  Rows are sorted by
    decreasing score.  Raises IndexError on any other axis value.

    NOTE: reads the module-level score_df built above.
    """
    if axis == 0:
        f_name = "row_implication_metric"
    elif axis == 1:
        f_name = "col_implication_metric"
    else:
        raise IndexError(f"no such axis {axis}")
    df = score_df[f_name].copy()
    # long format of the raw (w/, w/) counts, used as the support/mass column
    df_support = (
        score_df["tt_projection_metric"]
        .melt(ignore_index=False, value_name="value")
        .reset_index()
        .drop(columns=["level_0", "variable_0"], axis=1)
        .rename(columns={"level_1": "compound", "variable_1": "activity"})
        .set_index(["compound", "activity"])
    )
    # drop the class levels, keep only the keyword names
    df.index = df.index.droplevel(0)
    df.columns = df.columns.droplevel(0)
    # melt the scores to long format and index by (compound, activity)
    df = df.melt(ignore_index=False, value_name="%").reset_index()
    df.set_index(["index", "variable"], inplace=True)
    df.index.set_names(["compound", "activity"], inplace=True)
    df["mass"] = df_support["value"]
    # df["support"] = df_support["value"] / df["%"]
    df.sort_values(by="%", ascending=False, inplace=True)
    return df
# clean_implications(axis = 0)
print("Best implication rules : compound => activity")
# top-15 rules compound => activity
clean_implications(0).head(15)
Best implication rules : compound => activity
| % | mass | ||
|---|---|---|---|
| compound | activity | ||
| isoflavanoids | antibacterial | 1.000 | 2 |
| phenolic acid | antioxidant | 0.759 | 1825 |
| acetogenin | cytotoxicity | 0.724 | 228 |
| tetraterpene/carotenoid/xanthophyll | antioxidant | 0.717 | 7587 |
| pyrrolizidine | toxicity | 0.675 | 469 |
| polyterpene | toxicity | 0.667 | 2 |
| polyene | antifungal | 0.636 | 909 |
| flavonoids | antioxidant | 0.617 | 25763 |
| sesterterpene | cytotoxicity | 0.598 | 113 |
| biflavonoids | antioxidant | 0.531 | 400 |
| phenol | antioxidant | 0.517 | 27855 |
| acridine | cytotoxicity | 0.481 | 1296 |
| stilbene | antioxidant | 0.474 | 2475 |
| tannin | antioxidant | 0.474 | 5276 |
| phenylpropanoid | antioxidant | 0.428 | 357 |
print("Best implication rule : activity => compound")
# swaplevel puts the activity first for readability
clean_implications(1).swaplevel().head(15)
Best implication rule : activity => compound
| % | mass | ||
|---|---|---|---|
| activity | compound | ||
| toxicant | phenol | 0.583 | 70 |
| arbuscula | sesquiterpene | 0.455 | 5 |
| antioxidant | phenol | 0.446 | 27855 |
| flavonoids | 0.413 | 25763 | |
| uv | phenol | 0.398 | 2614 |
| repulsive | phenol | 0.367 | 22 |
| phytotoxicity | phenol | 0.335 | 413 |
| antidiabetic | flavonoids | 0.334 | 1967 |
| sedative | pyridine | 0.326 | 811 |
| drought | tetraterpene/carotenoid/xanthophyll | 0.320 | 345 |
| germination | phenol | 0.320 | 782 |
| salt | thiazole | 0.314 | 4202 |
| antimicrobial | tetracycline | 0.312 | 6811 |
| burns | tetracycline | 0.309 | 371 |
| metal | phenol | 0.309 | 7791 |
# Sankey diagram of the strongest (compound, activity) links.
import holoviews as hv
hv.extension("bokeh")
# hv.extension('matplotlib')
# long format (compound, activity, weight) built from the (w/, w/) counts
sankey_data = (
    score_df["tt_projection_metric"]
    .melt(ignore_index=False, value_name="weight")
    .reset_index()
    .drop(columns=["level_0", "variable_0"])
    .rename(columns={"level_1": "compound", "variable_1": "activity"})
    .fillna(0)
)
# keep only links supported by at least 1% of the papers
sankey_threshold = number_of_papers // 100
print(
    f"filtering couples (compound, activity) that have at least {sankey_threshold} papers (out of {number_of_papers})"
)
# https://holoviews.org/reference_manual/holoviews.plotting.bokeh.html#sankey-module
sankey_values = sankey_data[sankey_data["weight"] > sankey_threshold]
sankey_diagram = hv.Sankey(sankey_values.values)
sankey_diagram.opts(label_position="outer", width=1200, height=800)
filtering couples (compound, activity) that have at least 2439 papers (out of 243964)
# Multi-layer Sankey: compound classes -> compounds -> activities -> activity classes.
compounds_classes_graph = (
    margin_rows.xs("w/", level=2).reset_index().rename(columns={"level_0": "c-class", "level_1": "compound"})
)
activities_classes_graph = (
    margin_cols.transpose()
    .xs("w/", level=2)
    .swaplevel()
    .reset_index()
    .rename(columns={"level_1": "a-class", "level_0": "activity"})
)
# rename to avoid eponymous nodes (a class sharing its keyword's name)
compounds_classes_graph["c-class"] = compounds_classes_graph["c-class"].astype(str) + "(class)"
# rename to avoid eponymous nodes (a class sharing its keyword's name)
activities_classes_graph["a-class"] = activities_classes_graph["a-class"].astype(str) + "(class)"
# of all the classes, keep only those above the threshold used for the links
c_class = compounds_classes_graph[compounds_classes_graph["compound"].isin(sankey_values["compound"])]
a_class = activities_classes_graph[activities_classes_graph["activity"].isin(sankey_values["activity"])]
sankey = hv.Sankey(c_class.values.tolist() + sankey_values.values.tolist() + a_class.values.tolist())
sankey.opts(width=1200, height=800, node_color='index')